library(readr)
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(car)
## Loading required package: carData
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-7
library(caret)
## Loading required package: lattice
library(Metrics)
## 
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
## 
##     precision, recall
library(gvlma)
library(rsq)
library(readr)
library(readr)
library(MVA)
## Loading required package: HSAUR2
## Loading required package: tools
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following object is masked from 'package:Metrics':
## 
##     auc
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(HSAUR2)
library(SciViews)
library(scatterplot3d)
library(car)
library(lattice)
library(GGally)
library(caTools)
library(ggplot2)
library(ggridges)
library(ggvis)
## 
## Attaching package: 'ggvis'
## The following object is masked from 'package:Matrix':
## 
##     band
## The following object is masked from 'package:ggplot2':
## 
##     resolution
library(ggcorrplot)
library(ggthemes)
library(cowplot)
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggthemes':
## 
##     theme_map
library(gapminder)
library(gganimate)
## No renderer backend detected. gganimate will default to writing frames to separate files
## Consider installing:
## - the `gifski` package for gif output
## - the `av` package for video output
## and restarting the R session
## 
## Attaching package: 'gganimate'
## The following object is masked from 'package:ggvis':
## 
##     view_static
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2
## ──
## ✔ tibble  3.2.1     ✔ stringr 1.5.0
## ✔ tidyr   1.3.0     ✔ forcats 1.0.0
## ✔ purrr   1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand()     masks Matrix::expand()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ purrr::lift()       masks caret::lift()
## ✖ tidyr::pack()       masks Matrix::pack()
## ✖ dplyr::recode()     masks car::recode()
## ✖ ggvis::resolution() masks ggplot2::resolution()
## ✖ purrr::some()       masks car::some()
## ✖ tidyr::unpack()     masks Matrix::unpack()
library(grid)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library(RColorBrewer)
library(Hotelling)
## Loading required package: corpcor
## 
## Attaching package: 'Hotelling'
## 
## The following object is masked from 'package:dplyr':
## 
##     summarise
library(stats)
library(biotools)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## ---
## biotools version 4.2
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(FactoMineR)
library(ggfortify)
library(psych)
## 
## Attaching package: 'psych'
## 
## The following object is masked from 'package:car':
## 
##     logit
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(cluster)
library(corrplot)
## corrplot 0.92 loaded
library(caret)
library(MASS)
library(ggplot2)
library(memisc)
## 
## Attaching package: 'memisc'
## 
## The following object is masked from 'package:purrr':
## 
##     %@%
## 
## The following object is masked from 'package:tibble':
## 
##     view
## 
## The following objects are masked from 'package:dplyr':
## 
##     collect, recode, rename, syms
## 
## The following object is masked from 'package:Matrix':
## 
##     as.array
## 
## The following object is masked from 'package:car':
## 
##     recode
## 
## The following object is masked from 'package:ggplot2':
## 
##     syms
## 
## The following objects are masked from 'package:stats':
## 
##     contr.sum, contr.treatment, contrasts
## 
## The following object is masked from 'package:base':
## 
##     as.array
library(ROCR)
library(dplyr)
library(klaR)
library(readxl)
# Load the abalone dataset from a local Excel workbook (path is machine-specific)
# and inspect the marginal distribution of every column.
abalone <- read_excel("~/Documents/Courses/Multivariate Analysis/Final/Final_MVA_2023.xlsx")
summary(abalone)
##     Gender              Length          Diameter          Height      
##  Length:2835        Min.   :0.1550   Min.   :0.1100   Min.   :0.0150  
##  Class :character   1st Qu.:0.5150   1st Qu.:0.4000   1st Qu.:0.1350  
##  Mode  :character   Median :0.5850   Median :0.4600   Median :0.1550  
##                     Mean   :0.5696   Mean   :0.4464   Mean   :0.1544  
##                     3rd Qu.:0.6350   3rd Qu.:0.5000   3rd Qu.:0.1750  
##                     Max.   :0.8150   Max.   :0.6500   Max.   :1.1300  
##   Whole Weight    Shucked Weight   Viscera weight    Shell weight   
##  Min.   :0.0155   Min.   :0.0065   Min.   :0.0030   Min.   :0.0050  
##  1st Qu.:0.7013   1st Qu.:0.2870   1st Qu.:0.1520   1st Qu.:0.2025  
##  Median :1.0030   Median :0.4315   Median :0.2170   Median :0.2850  
##  Mean   :1.0168   Mean   :0.4391   Mean   :0.2225   Mean   :0.2912  
##  3rd Qu.:1.2895   3rd Qu.:0.5687   3rd Qu.:0.2875   3rd Qu.:0.3650  
##  Max.   :2.8255   Max.   :1.4880   Max.   :0.7600   Max.   :1.0050  
##      Rings     
##  Min.   : 3.0  
##  1st Qu.: 9.0  
##  Median :10.0  
##  Mean   :10.9  
##  3rd Qu.:12.0  
##  Max.   :29.0
# Age is defined as ring count plus 1.5 years (standard abalone convention).
abalone[["Age"]] <- abalone[["Rings"]] + 1.5
View(abalone)
# Confirm there are no missing values in any column.
colSums(is.na(abalone))
##         Gender         Length       Diameter         Height   Whole Weight 
##              0              0              0              0              0 
## Shucked Weight Viscera weight   Shell weight          Rings            Age 
##              0              0              0              0              0

Creating dummy variables for gender

# One-hot encode the Gender column via caret::dummyVars.
gender_encoder <- dummyVars("~ Gender", data = abalone)

# Expand the encoder against the Gender column (column 1) only.
gender_dummies <- predict(gender_encoder, newdata = abalone[1])

# Inspect the encoded columns if needed:
#head(gender_dummies)

# Attach the dummy columns and drop the original character column.
df <- cbind(abalone, gender_dummies)
df <- df[, -1]
View(df)

Correlation plot

# Pairwise scatterplots and correlations for every pair of columns.
ggpairs(df, title = "Abalone")

What attributes proved valuable in predicting the age? 1. Shell weight 2. Height 3. Diameter. These are the most valuable predictors of age. Rings also shows a correlation of 1.00, but only because Age is defined as Rings + 1.5, so it does not count.

Taking the log of the data, as the relationships in the data are non-linear in nature

# The raw measurements are right-skewed and related non-linearly, so work on
# the log scale. Apply log() to every numeric column in one pass instead of
# repeating the same assignment nine times (the original copy-pasted one
# `ablog$col <- log(ablog$col)` line per column).
ablog <- abalone
log_cols <- c("Length", "Diameter", "Height", "Whole Weight",
              "Shucked Weight", "Viscera weight", "Shell weight",
              "Age", "Rings")
ablog[log_cols] <- lapply(ablog[log_cols], log)
summary(ablog)
##     Gender              Length           Diameter           Height       
##  Length:2835        Min.   :-1.8643   Min.   :-2.2073   Min.   :-4.1997  
##  Class :character   1st Qu.:-0.6636   1st Qu.:-0.9163   1st Qu.:-2.0025  
##  Mode  :character   Median :-0.5361   Median :-0.7765   Median :-1.8643  
##                     Mean   :-0.5797   Mean   :-0.8253   Mean   :-1.8948  
##                     3rd Qu.:-0.4541   3rd Qu.:-0.6931   3rd Qu.:-1.7430  
##                     Max.   :-0.2046   Max.   :-0.4308   Max.   : 0.1222  
##   Whole Weight       Shucked Weight    Viscera weight     Shell weight      
##  Min.   :-4.166915   Min.   :-5.0360   Min.   :-5.8091   Min.   :-5.298317  
##  1st Qu.:-0.354891   1st Qu.:-1.2483   1st Qu.:-1.8839   1st Qu.:-1.597018  
##  Median : 0.002996   Median :-0.8405   Median :-1.5279   Median :-1.255266  
##  Mean   :-0.116366   Mean   :-0.9754   Mean   :-1.6411   Mean   :-1.358663  
##  3rd Qu.: 0.254255   3rd Qu.:-0.5643   3rd Qu.:-1.2465   3rd Qu.:-1.007858  
##  Max.   : 1.038685   Max.   : 0.3974   Max.   :-0.2744   Max.   : 0.004988  
##      Rings            Age       
##  Min.   :1.099   Min.   :1.504  
##  1st Qu.:2.197   1st Qu.:2.351  
##  Median :2.303   Median :2.442  
##  Mean   :2.353   Mean   :2.490  
##  3rd Qu.:2.485   3rd Qu.:2.603  
##  Max.   :3.367   Max.   :3.418
# Sanity check: the log transform introduced no NAs (all inputs were > 0).
colSums(is.na(ablog))
##         Gender         Length       Diameter         Height   Whole Weight 
##              0              0              0              0              0 
## Shucked Weight Viscera weight   Shell weight          Rings            Age 
##              0              0              0              0              0
# One-hot encode Gender on the log-transformed data, mirroring the earlier
# encoding of the raw data.
dummy_df_log <- dummyVars("~ Gender", data = ablog)

# Apply the dummy variables to the original data frame
transformed_df_log <- predict(dummy_df_log, newdata = ablog[1])

# View the transformed data frame
head(transformed_df_log)
##   GenderF GenderM
## 1       0       1
## 2       0       1
## 3       1       0
## 4       0       1
## 5       1       0
## 6       1       0
df_log <- cbind(ablog, transformed_df_log)
# Pairs plot coloured by gender. NOTE(review): the "standard deviation is zero"
# warnings presumably come from correlating the GenderF/GenderM dummy columns
# within a single gender group, where the dummy is constant — confirm.
ggpairs(data=df_log, aes(colour = Gender, alpha = 0.8), title="Pairs plot for abalone dataset") + 
  theme_grey(base_size = 8)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Drop the character Gender column; the dummy columns carry this information.
df_log <- df_log[, -1]

Train test split

set.seed(42) # set a seed value for reproducibility
# 70/30 split stratified on the response via caret::createDataPartition.
trainIndexlog <- createDataPartition(df_log$Rings, p = 0.7, list = FALSE)
trainDatalog <- df_log[trainIndexlog, ]
testDatalog <- df_log[-trainIndexlog, ]
# Linear model for log(Rings). GenderM is dropped from the formula: GenderF
# and GenderM always sum to 1, so including both alongside the intercept makes
# the design matrix rank-deficient (the original fit reported "1 not defined
# because of singularities" and predict() warned about a rank-deficient fit).
# Dropping the redundant dummy leaves the fitted values and every remaining
# coefficient unchanged, since GenderM's coefficient was NA anyway.
abalone.reg.log <- lm(Rings ~ Length + Diameter + Height + `Whole Weight` +
                        `Shucked Weight` + `Viscera weight` + `Shell weight` +
                        GenderF, data = trainDatalog)
summary(abalone.reg.log)
## 
## Call:
## lm(formula = Rings ~ Length + Diameter + Height + `Whole Weight` + 
##     `Shucked Weight` + `Viscera weight` + `Shell weight` + GenderF + 
##     GenderM, data = trainDatalog)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.53996 -0.12533 -0.01387  0.10732  0.72066 
## 
## Coefficients: (1 not defined because of singularities)
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       1.986736   0.122958  16.158  < 2e-16 ***
## Length           -0.375454   0.135945  -2.762 0.005801 ** 
## Diameter          0.152652   0.121675   1.255 0.209778    
## Height            0.059538   0.034662   1.718 0.086010 .  
## `Whole Weight`    0.859043   0.087705   9.795  < 2e-16 ***
## `Shucked Weight` -0.791485   0.043552 -18.173  < 2e-16 ***
## `Viscera weight` -0.117463   0.031819  -3.692 0.000229 ***
## `Shell weight`    0.348638   0.041921   8.317  < 2e-16 ***
## GenderF          -0.004024   0.008590  -0.468 0.639517    
## GenderM                 NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1885 on 1977 degrees of freedom
## Multiple R-squared:  0.4955, Adjusted R-squared:  0.4934 
## F-statistic: 242.7 on 8 and 1977 DF,  p-value: < 2.2e-16
# Hold-out predictions of log(Rings).
predictionslog <- predict(abalone.reg.log, newdata=testDatalog)
## Warning in predict.lm(abalone.reg.log, newdata = testDatalog): prediction from
## a rank-deficient fit may be misleading
# Column 8 of testDatalog is log(Rings) (Gender was dropped earlier).
RMSE(predictionslog, testDatalog[, 8])
## [1] 0.1928701
# Back-transform to the original ring scale before computing MAPE.
y_pred <- exp(predictionslog)
y_actual <- exp(testDatalog[, 8])
ape <- abs((y_actual - y_pred) / y_actual) * 100
mape <- mean(ape, na.rm = TRUE)
cat("MAPE:", round(mape, 2), "%")
## MAPE: 15.03 %
# Residual diagnostics: residuals vs fitted, normal Q-Q, and a histogram.
plot(abalone.reg.log, which = 1)

plot(abalone.reg.log, which = 2) #Normal Q-Q PLOT

hist(abalone.reg.log$residuals)

The MAPE of the model is 15.03%, i.e., predictions are off by about 15% on average. This error remains even though I have taken a log transformation of the entire dataset, since the raw data were non-linear. The model's residuals are not perfectly normally distributed: the residuals-vs-fitted plot shows some pattern (the residuals increase as the fitted values increase), and the Q-Q plot has a tail, indicating some departure from normality. There is scope to improve this model.

Logistic Regression

# Pairs plot of the untransformed data, coloured by gender.
ggpairs(data=abalone, aes(colour = Gender, alpha = 0.8), title="Pairs plot for abalone dataset") + 
  theme_grey(base_size = 8)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Train test split

set.seed(42) # reproducible sampling
# Encode gender as a two-level factor: 1 = male, 0 = female.
abalone$Sex_binary <- as.factor(ifelse(abalone$Gender == "M", 1, 0))
# Random 70/30 train/test split.
train_index1 <- sample(seq_len(nrow(abalone)), 0.7 * nrow(abalone))
train_data1 <- abalone[train_index1, ]
test_data1 <- abalone[-train_index1, ]

Model

# Logistic regression of sex on all morphometric measurements plus ring count.
logit_model <- glm(
  Sex_binary ~ Length + Diameter + Height + `Whole Weight` +
    `Shucked Weight` + `Viscera weight` + `Shell weight` + Rings,
  family = binomial,
  data = train_data1
)

Predict

# Predicted probability of being male, then a hard 0/1 call at threshold 0.5.
test_data1$predicted_sex <- predict(logit_model, newdata = test_data1, type = "response")
test_data1$predicted_sex_binary <- as.numeric(test_data1$predicted_sex > 0.5)

Evaluate model

# Confusion matrix of predicted vs actual sex on the hold-out set.
confusionMatrix(as.factor(test_data1$predicted_sex_binary), test_data1$Sex_binary)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 142 127
##          1 251 331
##                                           
##                Accuracy : 0.5558          
##                  95% CI : (0.5217, 0.5895)
##     No Information Rate : 0.5382          
##     P-Value [Acc > NIR] : 0.1594          
##                                           
##                   Kappa : 0.086           
##                                           
##  Mcnemar's Test P-Value : 2.509e-10       
##                                           
##             Sensitivity : 0.3613          
##             Specificity : 0.7227          
##          Pos Pred Value : 0.5279          
##          Neg Pred Value : 0.5687          
##              Prevalence : 0.4618          
##          Detection Rate : 0.1669          
##    Detection Prevalence : 0.3161          
##       Balanced Accuracy : 0.5420          
##                                           
##        'Positive' Class : 0               
## 
# Metrics::precision()/recall() (which mask caret's versions here) take
# (actual, predicted), both as 0/1 numeric vectors. The original calls passed
# the arguments in reverse order, and handed recall() a factor, which made it
# return NA with a warning.
actual1 <- as.numeric(as.character(test_data1$Sex_binary))
precision(actual1, test_data1$predicted_sex_binary)
recall(actual1, test_data1$predicted_sex_binary)
# ROC built from the hard 0/1 calls. NOTE(review): passing the fitted
# probabilities (test_data1$predicted_sex) instead would trace a proper curve
# rather than a single operating point.
roc <- roc(test_data1$Gender, test_data1$predicted_sex_binary)
## Setting levels: control = F, case = M
## Setting direction: controls < cases
plot(roc, main = "ROC curve", print.auc = TRUE)

The confusion matrix shows that the model predicted 142 cases as 0 where the actual value was also 0. The model predicted 127 cases as 0 where the actual value was 1, and predicted 251 cases as 1 where the actual value was 0. Finally, the model predicted 331 cases as 1 where the actual value was also 1.

The accuracy of the logistic regression model is 55.58%. Sensitivity: 0.3613.
Specificity: 0.7227.
AUC: 0.54. We can predict the gender, but the model is barely better than a coin flip.

# Build an LDA model
# NOTE(review): columns 2:11 include both Rings and Age, and Age = Rings + 1.5,
# so two predictors are perfectly collinear -- hence the warning below.
# The `- Sex_binary` term in the formula is redundant: the response is never
# also used as a predictor by `.`.
lda_model <- lda(Sex_binary ~ . - Sex_binary, data = train_data1[, 2:11])
## Warning in lda.default(x, grouping, ...): variables are collinear
# Check the model summary
summary(lda_model)
##         Length Class  Mode     
## prior    2     -none- numeric  
## counts   2     -none- numeric  
## means   18     -none- numeric  
## scaling  9     -none- numeric  
## lev      2     -none- character
## svd      1     -none- numeric  
## N        1     -none- numeric  
## call     3     -none- call     
## terms    3     terms  call     
## xlevels  0     -none- list
# Make predictions on the test set
lda_pred <- predict(lda_model, newdata = test_data1[, 2:11])
lda_pred <- lda_pred$class  # keep only the predicted class labels

# Calculate accuracy measures
table(lda_pred, test_data1$Sex_binary)
##         
## lda_pred   0   1
##        0 139 121
##        1 254 337
# Overall accuracy: proportion of correctly classified test cases.
acc <- sum(lda_pred == test_data1$Sex_binary)/length(test_data1$Sex_binary)
acc
## [1] 0.559342

The accuracy is 55.93%, which is only marginally better than the logistic regression model.

Logistic regression using PCA

# Principal components on the eight numeric predictors (columns 2:9),
# standardised to unit variance. Spell out the full argument name `scale.`
# instead of relying on partial matching of `scale` (the original worked only
# because `scale` partially matched prcomp's `scale.` formal).
pca <- prcomp(abalone[, 2:9], scale. = TRUE)
summary(pca)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6     PC7
## Standard deviation     2.4923 0.9697 0.6431 0.42965 0.35503 0.30635 0.14437
## Proportion of Variance 0.7764 0.1175 0.0517 0.02307 0.01576 0.01173 0.00261
## Cumulative Proportion  0.7764 0.8940 0.9457 0.96874 0.98450 0.99623 0.99884
##                            PC8
## Standard deviation     0.09646
## Proportion of Variance 0.00116
## Cumulative Proportion  1.00000

Scree diagram

# Scree plot: variance explained by each principal component.
fviz_eig(pca, addlabels = TRUE)

I am choosing 3 principal components after looking at the scree plot.

# Keep the score matrix for the first three PCs and attach the binary sex label.
# (The prcomp object is intentionally overwritten with a data frame of scores.)
pca <- as.data.frame(pca$x)[1:3]
pca$Gender <- abalone$Sex_binary

Train test split

set.seed(42) # reproducible sampling
# Random 70/30 split of the PCA-score data.
train_index2 <- sample(seq_len(nrow(pca)), 0.7 * nrow(pca))
train_data2 <- pca[train_index2, ]
test_data2 <- pca[-train_index2, ]

Model

# Logistic regression of sex on the first three principal component scores.
logit_model2 <- glm(Gender ~ PC1 + PC2 + PC3, family = binomial, data = train_data2)

Predict

# Probability of class "1", then a hard 0/1 call at the 0.5 threshold.
test_data2$predicted_sex <- predict(logit_model2, newdata = test_data2, type = "response")
test_data2$predicted_sex_binary <- as.numeric(test_data2$predicted_sex > 0.5)

Evaluate model

# Confusion matrix for the PCA-based classifier on the hold-out set.
confusionMatrix(as.factor(test_data2$predicted_sex_binary), test_data2$Gender)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  93  97
##          1 300 361
##                                           
##                Accuracy : 0.5335          
##                  95% CI : (0.4993, 0.5674)
##     No Information Rate : 0.5382          
##     P-Value [Acc > NIR] : 0.6218          
##                                           
##                   Kappa : 0.0258          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.2366          
##             Specificity : 0.7882          
##          Pos Pred Value : 0.4895          
##          Neg Pred Value : 0.5461          
##              Prevalence : 0.4618          
##          Detection Rate : 0.1093          
##    Detection Prevalence : 0.2233          
##       Balanced Accuracy : 0.5124          
##                                           
##        'Positive' Class : 0               
## 
# Metrics::precision()/recall() take (actual, predicted) as 0/1 numerics; the
# original calls reversed the arguments and gave recall() a factor, which made
# it return NA with a warning.
actual2 <- as.numeric(as.character(test_data2$Gender))
precision(actual2, test_data2$predicted_sex_binary)
recall(actual2, test_data2$predicted_sex_binary)
# ROC built from the hard 0/1 calls; using the fitted probabilities
# (test_data2$predicted_sex) would give a proper curve.
roc1 <- roc(test_data2$Gender, test_data2$predicted_sex_binary)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc1, main = "ROC curve", print.auc = TRUE)

We are getting worse accuracy with PCA.

Logistic Regression using FA

# Factor analysis via principal-component extraction: 4 varimax-rotated factors
# on the eight numeric columns.
fit.pc <- principal(abalone[, 2:9], nfactors=4, rotate="varimax")
fit.pc
## Principal Components Analysis
## Call: principal(r = abalone[, 2:9], nfactors = 4, rotate = "varimax")
## Standardized loadings (pattern matrix) based upon correlation matrix
##                 RC1  RC3  RC2  RC4   h2      u2 com
## Length         0.82 0.29 0.15 0.45 0.99 0.01175 1.9
## Diameter       0.81 0.30 0.18 0.46 0.99 0.01016 2.0
## Height         0.44 0.87 0.17 0.12 1.00 0.00051 1.6
## Whole Weight   0.93 0.30 0.17 0.10 0.99 0.00871 1.3
## Shucked Weight 0.93 0.27 0.00 0.09 0.95 0.05381 1.2
## Viscera weight 0.91 0.29 0.12 0.06 0.93 0.06599 1.2
## Shell weight   0.81 0.33 0.35 0.12 0.91 0.09080 1.8
## Rings          0.13 0.12 0.98 0.05 0.99 0.00831 1.1
## 
##                        RC1  RC3  RC2  RC4
## SS loadings           4.76 1.31 1.21 0.47
## Proportion Var        0.60 0.16 0.15 0.06
## Cumulative Var        0.60 0.76 0.91 0.97
## Proportion Explained  0.61 0.17 0.16 0.06
## Cumulative Proportion 0.61 0.78 0.94 1.00
## 
## Mean item complexity =  1.5
## Test of the hypothesis that 4 components are sufficient.
## 
## The root mean square of the residuals (RMSR) is  0.01 
##  with the empirical chi square  26.47  with prob <  1.8e-06 
## 
## Fit based upon off diagonal values = 1
# Eigenvalues of the correlation matrix (variance carried by each component).
round(fit.pc$values, 3)
## [1] 6.212 0.940 0.414 0.185 0.126 0.094 0.021 0.009
# Rotated loadings; note the stored column order is RC1, RC3, RC2, RC4.
fit.pc$loadings
## 
## Loadings:
##                RC1    RC3    RC2    RC4   
## Length          0.824  0.294  0.151  0.448
## Diameter        0.806  0.305  0.182  0.462
## Height          0.444  0.872  0.171  0.117
## Whole Weight    0.927  0.303  0.174       
## Shucked Weight  0.931  0.267              
## Viscera weight  0.912  0.290  0.121       
## Shell weight    0.814  0.327  0.352  0.125
## Rings           0.125  0.119  0.979       
## 
##                  RC1   RC3   RC2   RC4
## SS loadings    4.761 1.307 1.213 0.469
## Proportion Var 0.595 0.163 0.152 0.059
## Cumulative Var 0.595 0.759 0.910 0.969
# Loadings with more digits
# Prints the first variable's (Length's) loadings. The stored columns are
# ordered RC1, RC3, RC2, RC4, so indexing them as c(1,3,2,4) prints the
# loadings in RC1, RC2, RC3, RC4 order.
for (i in c(1,3,2,4)) { print(fit.pc$loadings[[1,i]])}
## [1] 0.8235491
## [1] 0.1508324
## [1] 0.293559
## [1] 0.4484296
# Communalities
# Proportion of each variable's variance reproduced by the four factors.
fit.pc$communality
##         Length       Diameter         Height   Whole Weight Shucked Weight 
##      0.9882495      0.9898375      0.9994898      0.9912878      0.9461900 
## Viscera weight   Shell weight          Rings 
##      0.9340059      0.9092005      0.9916902
# Rotated factor scores, Notice the columns ordering: RC1, RC3, RC2 and RC4
# Play with FA utilities
# Parallel analysis to suggest how many factors/components to retain.
fa.parallel(abalone[, 2:9]) # See factor recommendation
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.

## Parallel analysis suggests that the number of factors =  3  and the number of components =  1
# Diagnostic plots and structure recommendations for the factor solution.
fa.plot(fit.pc) # See Correlations within Factors

fa.diagram(fit.pc) # Visualize the relationship

vss(abalone[, 2:9]) # See Factor recommendations for a simple structure
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

## 
## Very Simple Structure
## Call: vss(x = abalone[, 2:9])
## VSS complexity 1 achieves a maximimum of 0.97  with  1  factors
## VSS complexity 2 achieves a maximimum of 0.99  with  2  factors
## 
## The Velicer MAP achieves a minimum of 0.12  with  1  factors 
## BIC achieves a minimum of  166.96  with  4  factors
## Sample Size adjusted BIC achieves a minimum of  173.31  with  4  factors
## 
## Statistics by number of factors 
##   vss1 vss2  map dof   chisq     prob sqresid  fit RMSEA  BIC SABIC complex
## 1 0.97 0.00 0.12  20 7.7e+03  0.0e+00    1.19 0.97  0.37 7555  7618     1.0
## 2 0.94 0.99 0.15  13 5.4e+03  0.0e+00    0.46 0.99  0.38 5316  5358     1.2
## 3 0.91 0.97 0.24   7 5.8e+02 3.1e-121    0.45 0.99  0.17  525   548     1.5
## 4 0.88 0.94 0.22   2 1.8e+02  2.0e-40    0.48 0.99  0.18  167   173     1.7
## 5 0.86 0.93 0.34  -2 1.4e+01       NA    0.47 0.99    NA   NA    NA     1.9
## 6 0.87 0.94 0.45  -5 2.1e+00       NA    0.43 0.99    NA   NA    NA     1.9
## 7 0.86 0.93 1.00  -7 1.4e-04       NA    0.43 0.99    NA   NA    NA     1.9
## 8 0.86 0.93   NA  -8 1.4e-04       NA    0.43 0.99    NA   NA    NA     1.9
##    eChisq    SRMR  eCRMS eBIC
## 1 3.7e+02 4.9e-02 0.0575  216
## 2 4.7e+01 1.7e-02 0.0252  -57
## 3 1.6e+00 3.2e-03 0.0064  -54
## 4 5.3e-01 1.8e-03 0.0069  -15
## 5 1.2e-02 2.8e-04     NA   NA
## 6 8.7e-04 7.4e-05     NA   NA
## 7 5.9e-08 6.1e-07     NA   NA
## 8 5.9e-08 6.1e-07     NA   NA
# Replace the psych object with a data frame of its factor scores. The first
# two stored columns are RC1 and RC3 (columns are ordered RC1, RC3, RC2, RC4).
fit.pc <- as.data.frame(fit.pc$scores)
fit.pc <- fit.pc[1:2]

# Attach the classification target and Rings for later use.
fit.pc$Gender <- abalone$Sex_binary
fit.pc$Rings <- abalone$Rings

Train test split

set.seed(42) # reproducible sampling
# Random 70/30 split of the factor-score data.
train_index3 <- sample(seq_len(nrow(fit.pc)), 0.7 * nrow(fit.pc))
train_data3 <- fit.pc[train_index3, ]
test_data3 <- fit.pc[-train_index3, ]

Model

# Logistic regression of sex on the first two rotated factor scores.
logit_model3 <- glm(Gender ~ RC1 + RC3, family = binomial, data = train_data3)

Predict

# Probability of class "1", then a hard 0/1 call at the 0.5 threshold.
test_data3$predicted_sex <- predict(logit_model3, newdata = test_data3, type = "response")
test_data3$predicted_sex_binary <- as.numeric(test_data3$predicted_sex > 0.5)

Evaluate model

# Confusion matrix for the factor-analysis-based classifier on the hold-out set.
confusionMatrix(as.factor(test_data3$predicted_sex_binary), test_data3$Gender)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  56  62
##          1 337 396
##                                          
##                Accuracy : 0.5311         
##                  95% CI : (0.497, 0.5651)
##     No Information Rate : 0.5382         
##     P-Value [Acc > NIR] : 0.6728         
##                                          
##                   Kappa : 0.0075         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.1425         
##             Specificity : 0.8646         
##          Pos Pred Value : 0.4746         
##          Neg Pred Value : 0.5402         
##              Prevalence : 0.4618         
##          Detection Rate : 0.0658         
##    Detection Prevalence : 0.1387         
##       Balanced Accuracy : 0.5036         
##                                          
##        'Positive' Class : 0              
## 
# Metrics::precision()/recall() take (actual, predicted) as 0/1 numerics; the
# original calls reversed the arguments and gave recall() a factor, which made
# it return NA with a warning.
actual3 <- as.numeric(as.character(test_data3$Gender))
precision(actual3, test_data3$predicted_sex_binary)
recall(actual3, test_data3$predicted_sex_binary)
# ROC built from the hard 0/1 calls; using the fitted probabilities
# (test_data3$predicted_sex) would give a proper curve.
roc2 <- roc(test_data3$Gender, test_data3$predicted_sex_binary)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc2, main = "ROC curve", print.auc = TRUE)

FA also does not give us promising results for logistic regression to predict gender. Dimensionality-reduction techniques did not improve the classification model; both PCA and FA failed to perform.

  1. The linear regression model is decent, with a MAPE of 15%. The residual analysis shows that there are discernible patterns in the data even after taking a log transformation of the dataset. The Q-Q plot shows a tail, indicating that the residuals do not follow a perfect normal distribution.

Attributes proved valuable in predicting the age are: 1. Shell weight 2. Height 3. Diameter There is also rings with a 1.00 correlation because it is just rings + 1.5 so that does not count.

  1. The MAPE on the Rings column is 15%.
  2. We can predict the gender, but it is barely better than a coin flip. I have used various accuracy measures such as AUC, ROC, and the confusion matrix. Logistic regression accuracy: 55.58%. LDA: 55.93%.
  3. No, dimensionality reduction does not improve our accuracy. PCA: 53.35%. FA: 53.11%.